Loading libraries
library(dplyr)
library(ggplot2)
library(plotly)
Reading data
# Load at most the first 10000 records of the summary file. read.csv2()
# defaults to ";" as the field separator and "," as the decimal mark.
data <- read.csv2(file = "./all_summary.csv", nrows = 10000)
# Print the number of rows and columns that were read.
dim(data)
## [1] 10000 412
Removing rows for excluded ligands (filtered by res_name)
# res_name values whose rows are dropped from the data set. The grouping below
# is only for readability; the order of the values is unchanged.
# NOTE(review): "H20" is spelled with the digit zero -- confirm whether "H2O"
# was intended.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE", "LEU",
  "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP", "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU",
  "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)
# Keep only the rows whose res_name is not on the exclusion list
# (%in% never yields NA, so rows with a missing res_name are kept).
data <- data[!(data$res_name %in% deletable_res_name), ]
dim(data)
## [1] 9940 412
Processing missing data
# NOTE(review): complete-case filtering is currently disabled (both lines are
# commented out), so this section does not modify `data`; any rows with
# missing values remain in it.
#data <- data[complete.cases(data), ]
#dim(data)
Data summary
# The per-column summary table is disabled (commented out); only the
# dimensions of `data` are printed here.
#knitr::kable(summary(data))
dim(data)
## [1] 9940 412
Keeping only the 50 most popular ligands
# Count rows per res_name, order by descending count, keep the first 50 and
# extract the res_name column as a plain vector.
popular_res_names <- data %>%
  group_by(res_name) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(50) %>%
  pull(res_name)
# Restrict the data set to rows belonging to those most frequent res_names.
data <- data[data$res_name %in% popular_res_names, ]
dim(data)
## [1] 6789 412
Cardinality of ligands
# Number of rows per res_name, largest first.
count_by_name <- data %>%
  group_by(res_name) %>%
  summarise(cardinality = n()) %>%
  arrange(desc(cardinality))
count_by_name
## # A tibble: 50 x 2
## res_name cardinality
## <fct> <int>
## 1 SO4 1007
## 2 GOL 632
## 3 EDO 516
## 4 NAG 464
## 5 CL 387
## 6 DMS 340
## 7 ZN 323
## 8 CA 284
## 9 HEM 260
## 10 MG 206
## # ... with 40 more rows
# Bar chart of the per-ligand counts: bars are ordered by descending
# cardinality and the fill colour is also mapped to cardinality.
plot <- ggplot(
  count_by_name,
  aes(
    x = reorder(res_name, -cardinality),
    y = cardinality,
    fill = cardinality
  )
) +
  geom_bar(stat = "identity") +
  # Rotate the ligand names so they are drawn vertically along the x axis.
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "ligand", title = "Cardinality of ligands by name")
# Convert the static ggplot object into an interactive plotly widget.
ggplotly(plot)